IMPORT

1. Librairies

library(phyloseq) # for phyloseq object
library(ggplot2)
library(ggsignif)
library(RColorBrewer)
library(cowplot)
library(plyr)
library(dplyr)
library("plotly") # plot 3D
library("microbiome") # for centered log-ratio
library("coda") # Aitchison distance
library("coda.base") # Aitchison distance
library("vegan") # NMDS
library(pheatmap) # for heatmap

2. Data

# Set path
path <- "~/Projects/IBS_Meta-analysis_16S"
path.plots <- file.path(path, "data/analysis-individual/PLOTS/plots-Zeber-EDA")

# Import phyloseq object
physeq.zeber <- readRDS(file.path(path, "phyloseq-objects/physeq_zeber.rds"))

# Sanity check
physeq.zeber
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 14812 taxa and 90 samples ]
## sample_data() Sample Data:       [ 90 samples by 10 sample variables ]
## tax_table()   Taxonomy Table:    [ 14812 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 14812 tips and 14810 internal nodes ]
## refseq()      DNAStringSet:      [ 14812 reference sequences ]

Phylogenetic tree was computed with the package phangorn, and the script was run on a cluster. Let’s check we have correctly generated a phylogenetic tree.

# Look at the tree (each dot is an ASV)
plot_tree(physeq.zeber, color = "Phylum", ladderize="left")

DEMOGRAPHICS

This dataset has covariates on gender Let’s verify the distribution is similar between healthy & IBS groups.

# Number of individuals in each group
metadata <- data.frame(sample_data(physeq.zeber))
metadata %>%
  dplyr::count(host_disease)
# Gender
metadata %>%
  dplyr::count(host_disease, host_sex)
# gender unknown in healthy...

ABUNDANCES

1. Absolute abundances

# Plot Phylum
plot_bar(physeq.zeber, fill = "Phylum") +
  facet_wrap("host_disease", scales="free_x") +
  theme_cowplot()+
  theme(axis.text.x = element_text(size=6, angle=90))+
  labs(x = "Samples", y = "Absolute abundance", title = "Zeber dataset (2016)")

# ggsave(file.path(path.plots, "absAbundance_phylum.jpg"), width=13, height=5)

# Plot Class
plot_bar(physeq.zeber, fill = "Class")+
  facet_wrap("host_disease", scales="free_x") +
  theme_cowplot()+
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Absolute abundance", title = "Zeber dataset (2016)")

Sequencing depth characteristics of the Zeber dataset:
- minimum of 2.654610^{4} total count per sample
- median: 4.8403510^{4} total count per sample
- maximum of 1.4801610^{5} total count per sample

2. Relative abundances

# Agglomerate to phylum & class levels
phylum.table <- physeq.zeber %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

class.table <- physeq.zeber %>%
  tax_glom(taxrank = "Class") %>%
  transform_sample_counts(function(x) {x/sum(x)} ) %>%
  psmelt()


# Plot relative abundances
ggplot(phylum.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Firmicutes', 'Abundance'])),
                         y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease, scales = "free_x") + # scales = "free" removes empty lines
  geom_bar(stat = "identity", width=0.95) +
  theme_cowplot()+
  theme(axis.text.x = element_text(size=4, angle=90))+
  labs(x = "Samples", y = "Relative abundance", title = "Zeber-Lubecka dataset (2016)")

# ggsave(file.path(path.plots, "relAbundance_phylum.jpg"), width=12, height=5)

ggplot(class.table, aes(x = reorder(Sample, Sample, function(x) mean(class.table[Sample == x & Phylum == 'Firmicutes', 'Abundance'])),
                        y = Abundance, fill = Class))+
  facet_wrap(~ host_disease, scales = "free_x") +
  geom_bar(stat = "identity", width=0.95) +
  theme_cowplot()+
  theme(axis.text.x = element_text(size=4, angle=90))+
  labs(x = "Samples", y = "Relative abundance", title = "Zeber-Lubecka dataset (2016)")

# ggsave(file.path(path.plots, "relAbundance_class.jpg"), width=12, height=5)

3. Firmicutes/Bacteroidota ratio

# Extract abundance of only Bacteroidota and Firmicutes
bacter <- phylum.table %>%
  filter(Phylum == "Bacteroidota") %>%
  select(c('Sample', 'Abundance', 'host_disease', 'host_subtype', 'host_sex', 'Phylum')) %>%
  arrange(Sample)

firmi <- phylum.table %>%
  filter(Phylum == "Firmicutes") %>%
  select(c('Sample', 'Abundance', 'host_disease', 'host_subtype', 'host_sex', 'Phylum')) %>%
  arrange(Sample)

# Calculate log2 ratio Firmicutes/Bacteroidota
ratio.FB <- data.frame('Sample' = bacter$Sample,
                       'host_disease' = bacter$host_disease,
                       'host_subtype' = bacter$host_subtype,
                       'Bacteroidota' = bacter$Abundance,
                       'Firmicutes' = firmi$Abundance,
                       'host_sex' = bacter$host_sex)
ratio.FB$logRatioFB <- log2(ratio.FB$Firmicutes / ratio.FB$Bacteroidota)

# Plot log2 ratio Firmicutes/Bacteroidota
ggplot(ratio.FB, aes(x = host_disease, y = logRatioFB))+
  geom_violin(aes(fill=host_disease))+
  scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
  geom_jitter(width=0.1)+
  geom_signif(comparisons = list(c("Healthy", "IBS")), map_signif_level = TRUE, test="wilcox.test") +
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Firmicutes:Bacteroidota ratio")+
  theme_cowplot()+
  theme(legend.position="none")

# ggsave(file.path(path.plots, "ratioFB.jpg"), width=4, height=6)

# Statistical test
wilcox.test(ratio.FB[ratio.FB$host_disease == "IBS","logRatioFB"],
            ratio.FB[ratio.FB$host_disease == "Healthy","logRatioFB"]) # p = 0.97
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$host_disease == "IBS", "logRatioFB"] and ratio.FB[ratio.FB$host_disease == "Healthy", "logRatioFB"]
## W = 616, p-value = 0.9671
## alternative hypothesis: true location shift is not equal to 0
# Plot log2 ratio Firmicutes/Bacteroidota per IBS subtype
ggplot(ratio.FB, aes(x = host_subtype, y = logRatioFB))+
  geom_violin(aes(fill=host_disease))+
  scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
  geom_jitter(width=0.1)+
  geom_signif(comparisons = list(c("HC", "IBS-C"), c("HC", "IBS-D"), c("HC", "IBS-M")),
              map_signif_level = TRUE, test="wilcox.test", step_increase=0.1) +
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Firmicutes:Bacteroidota ratio")+
  theme_cowplot() +
  theme(legend.position="none")

# ggsave(file.path(path.plots, "ratioFB_subtype.jpg"), width=5, height=7)

# Statistical test
wilcox.test(ratio.FB[ratio.FB$host_subtype == "HC","logRatioFB"],
            ratio.FB[ratio.FB$host_subtype == "IBS-C","logRatioFB"]) # p = 0.003
## 
##  Wilcoxon rank sum exact test
## 
## data:  ratio.FB[ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$host_subtype == "IBS-C", "logRatioFB"]
## W = 32, p-value = 0.002913
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(ratio.FB[ratio.FB$host_subtype == "HC","logRatioFB"],
            ratio.FB[ratio.FB$host_subtype == "IBS-D","logRatioFB"]) # p = 0.36
## 
##  Wilcoxon rank sum exact test
## 
## data:  ratio.FB[ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$host_subtype == "IBS-D", "logRatioFB"]
## W = 316, p-value = 0.3646
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(ratio.FB[ratio.FB$host_subtype == "HC","logRatioFB"],
            ratio.FB[ratio.FB$host_subtype == "IBS-M","logRatioFB"]) # p = 0.64
## 
##  Wilcoxon rank sum exact test
## 
## data:  ratio.FB[ratio.FB$host_subtype == "HC", "logRatioFB"] and ratio.FB[ratio.FB$host_subtype == "IBS-M", "logRatioFB"]
## W = 277, p-value = 0.6374
## alternative hypothesis: true location shift is not equal to 0

NORMALIZE DATA

# Sanity check no sample with less than 500 total count
table(sample_sums(physeq.zeber)<500) # all FALSE

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH NON-ZERO COMPOSITIONS
physeq.NZcomp <- physeq.zeber
otu_table(physeq.NZcomp)[otu_table(physeq.NZcomp) == 0] <- 0.5 # pseudocounts

# Sanity check that 0 values have been replaced
# otu_table(physeq.zeber)[1:5,1:5]
# otu_table(physeq.NZcomp)[1:5,1:5]

# transform into compositions
physeq.NZcomp <- transform_sample_counts(physeq.NZcomp, function(x) x / sum(x) )
table(rowSums(otu_table(physeq.NZcomp))) # check if there is any row not summing to 1

# Save object
saveRDS(physeq.NZcomp, file.path(path, "data/analysis-individual/Zeber-2016/02_EDA-Zeber/physeq_NZcomp.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.zeber
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) )

# check the counts are all relative
# otu_table(physeq.zeber)[1:5, 1:5]
# otu_table(physeq.rel)[1:5, 1:5]

# sanity check
table(rowSums(otu_table(physeq.rel))) # check if there is any row not summing to 1

# save the physeq.rel object
saveRDS(physeq.rel, file.path(path, "data/analysis-individual/Zeber-2016/02_EDA-Zeber/physeq_relative.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH COMMON-SCALE NORMALIZATION
physeq.CSN <- physeq.zeber
physeq.CSN <- transform_sample_counts(physeq.CSN, function(x) (x*min(sample_sums(physeq.CSN))) / sum(x) )

# sanity check
table(rowSums(otu_table(physeq.CSN))) # check that all rows are summing to the same total

# save the physeq.CSN object
saveRDS(physeq.CSN, file.path(path, "data/analysis-individual/Zeber-2016/02_EDA-Zeber/physeq_CSN.rds"))


#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.zeber
physeq.clr <- microbiome::transform(physeq.zeber, "clr") # the function adds pseudocounts itself

# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
# otu_table(physeq.zeber)[1:5, 1:5] # should contain absolute counts
# otu_table(physeq.clr)[1:5, 1:5] # should all be relative

# save the physeq.rel object
saveRDS(physeq.clr, file.path(path, "data/analysis-individual/Zeber-2016/02_EDA-Zeber/physeq_clr.rds"))

COMPUTE DISTANCES

1. UniFrac, Aitchison, Bray-Curtis and Canberra

First, let’s look at these four distances of interest.

#____________________________________________________________________________________
# Measure distances
getDistances <- function(){
  set.seed(123) # for unifrac, need to set a seed
  glom.UniF <- UniFrac(physeq.rel, weighted=TRUE, normalized=TRUE) # weighted unifrac
  glom.ait <- phyloseq::distance(physeq.clr, method = 'euclidean') # aitchison
  glom.bray <- phyloseq::distance(physeq.CSN, method = "bray") # bray-curtis
  glom.can <- phyloseq::distance(physeq.NZcomp, method = "canberra") # canberra
  dist.list <- list("UniF" = glom.UniF, "Ait" = glom.ait, "Canb" = glom.can, "Bray" = glom.bray)
  
  return(dist.list)
}


#____________________________________________________________________________________
# Plot in 2D the distances
plotDistances2D <- function(dlist, ordination="MDS"){
  plist <- NULL
  plist <- vector("list", 4)
  names(plist) <- c("Weighted Unifrac", "Aitchison", "Bray-Curtis", "Canberra")
  
  print("Unifrac")
  # Weighted UniFrac
  set.seed(123)
  iMDS.UniF <- ordinate(physeq.rel, ordination, distance=dlist$UniF)
  plist[[1]] <- plot_ordination(physeq.rel, iMDS.UniF, color="host_disease")
  
  print("Aitchison")
  # Aitchison
  set.seed(123)
  iMDS.Ait <- ordinate(physeq.clr, ordination, distance=dlist$Ait)
  plist[[2]] <- plot_ordination(physeq.clr, iMDS.Ait, color="host_disease")
  
  print("Bray")
  # Bray-Curtis
  set.seed(123)
  iMDS.Bray <- ordinate(physeq.CSN, ordination, distance=dlist$Bray)
  plist[[3]] <- plot_ordination(physeq.CSN, iMDS.Bray, color="host_disease")
  
  print("Canberra")
  # Canberra
  set.seed(123)
  iMDS.Can <- ordinate(physeq.NZcomp, ordination, distance=dlist$Can)
  plist[[4]] <- plot_ordination(physeq.NZcomp, iMDS.Can, color="host_disease")
  
  # Creating a dataframe to plot everything
  plot.df = ldply(plist, function(x) x$data)
  names(plot.df)[1] <- "distance"
  
  return(plot.df)
}

Now let’s plot!

# Get the distances & the plot data
dist.zeber <- getDistances()
plot.df <- plotDistances2D(dist.zeber)
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
a <- ggplot(plot.df, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=4, alpha=0.5) + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20))+
  labs(color="Disease")

b <- ggplot(plot.df, aes(Axis.1, Axis.2, color=host_subtype))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = scales::alpha(c("blue", "brown", "red", "orange"), .3))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_blank())+
  labs(color="Subtype")

ggpubr::ggarrange(a,b, nrow=2)

# ggsave(file.path(path.plots, "distances4_MDS.jpg"), height = 7, width = 14)

2. Plot in 3D

For better visualization, we will also take a glance at reduction to 3D.

#____________________________________________________________________________________
# Plot 3D ordination
plotDistances3D <- function(d, name_dist){
  
  # Reset parameters
  mds.3D <- NULL
  xyz <- NULL
  fig.3D <- NULL
  
  # Reduce distance matrix to 3 dimensions
  set.seed(123)
  mds.3D <- metaMDS(d, method="MDS", k=3, trace = 0)
  xyz <- scores(mds.3D, display="sites") # pull out the (x,y,z) coordinates
  
  # Plot
  fig.3D <- plot_ly(x=xyz[,1], y=xyz[,2], z=xyz[,3], type="scatter3d", mode="markers",
                    color=sample_data(physeq.zeber)$host_disease, colors = c("blue", "red"))%>%
    layout(title = paste('MDS in 3D with', name_dist, 'distance', sep = ' '))
  
  return(fig.3D)
}

Now let’s plot!

plotDistances3D(dist.zeber$UniF, "UniFrac")
plotDistances3D(dist.zeber$Ait, "Aitchison")
plotDistances3D(dist.zeber$Canb, "Canberra")
plotDistances3D(dist.zeber$Bray, "Bray-Curtis")

HIERARCHICAL CLUSTERING

# For heatmaps: have group color
matcol <- data.frame(group = sample_data(physeq.zeber)[,"host_disease"],
                     host_subtype = sample_data(physeq.zeber)[,"host_subtype"])


# Function to get heatmap from the distances computed
plotHeatmaps <- function(dlist, fontsize){
  
  # Initialize variables
  i=1
  plist <- vector("list", 4)
  names(plist) <- names(dlist)
  
  # Loop through distances
  for(d in dlist){
    plist[[i]] <- pheatmap(as.matrix(d), 
                          clustering_distance_rows = d,
                          clustering_distance_cols = d,
                          fontsize = fontsize,
                          fontsize_col = fontsize-5,
                          fontsize_row = fontsize-5,
                          annotation_col = matcol,
                          annotation_row = matcol,
                          annotation_colors = list(host_disease = c('Healthy' = 'blue', 'IBS' = 'red'),
                                                   host_subtype = c('HC'='black', 'IBS-M'='orange', 'IBS-C'='brown', 'IBS-D'='red')),
                          cluster_rows = T,
                          cluster_cols = T,
                          clustering_method = 'complete', # hc method
                          main = names(dlist)[i]) # have name of distance as title
    i <- i+1
  }
  
  return(plist)
}


# Get the heatmaps
heatmp.zeber <- plotHeatmaps(dlist = dist.zeber, fontsize = 8)

REPRODUCE PLOTS FROM PAPER

Figure 1 - The phylogenetic tree of bacteria detected in samples

library(trac)
library(treeio)
library(ggtree)

# Function to obtain taxonomic tree information
phyloseq_to_treedata <- function(physeq){
  
  ## ***********************
  ## 1 - GET TAXONOMIC TABLE
  tax <- physeq@tax_table@.Data[,1:6]
  # Add a "Life" column (rank0)
  tax <- cbind("Life", tax)
  colnames(tax)[1] <- "Life"
  # tax[1:5,1:7]
  
  
  ## ***********************
  ## 2 - GET TAXONOMIC TABLE WITH CUMULATIVE LABELS
  full_tax <- tax
  for (i in 2:7) {
    full_tax[, i] <- paste(full_tax[, i-1], full_tax[, i], sep = "::")
  }
  full_tax <- as.data.frame(full_tax, stringsAsFactors = TRUE)
  # full_tax[1:5,1:7]
  
  
  ## ***********************
  ## 3 - GET PHYLO OBJECT
  tree.phylo <- tax_table_to_phylo(~Life/Kingdom/Phylum/Class/Order/Family/Genus,
                                   data = full_tax, collapse = TRUE)
  
  
  ## ***********************
  ## 4 - GET TREEDATA OBJECT
  
  # We will define new labels for the tips
  d <- data.frame(label=full_tax$Genus,
                  new_label=tax[,"Genus"],
                  phylum=tax[,"Phylum"])
  d[!d$phylum %in% c("Actinobacteriota", "Bacteroidota", "Firmicutes", "Fusobacteriota", "Proteobacteria", "Verrucomicrobiota"), "phylum"] <- "Other"
  
  # Create a treedata object, that will contain the new_label
  tree.td <- full_join(tree.phylo, d, by="label")
  
  # Get the list of tip labels belonging to each phylum (phyla are the names of the elements in the list)
  l <- list()
  for (phylum in unique(d$phylum)){
    print(phylum)
    l[[phylum]] <- d[d$phylum==phylum,"label"]
  }
  
  # Group the nodes and edges by phylum (to color the branches by phylum)
  tree.td <- groupOTU(tree.td, l)
  
  
  ## ***********************
  ## 5 - RETURN TREEDATA OBJECT
  return(tree.td)
}


# Colors
colors <- c("#3690c0", "#006d2c", "#ef3b2c", "#c994c7", "#3f007d", "#fee391", "#d9d9d9")
names(colors) <- c("Actinobacteriota", "Bacteroidota", "Firmicutes", "Fusobacteriota", "Proteobacteria", "Verrucomicrobiota", "Other")

# Agglomerate to genus level
physeq.glom <- tax_glom(physeq.zeber, "Genus")
# Keep only genera representing >1% of reads
totalreads <- sum(otu_table(physeq.zeber))
physeq.glom <- filter_taxa(physeq.glom, function(x) sum(x) > (0.01*totalreads), TRUE)
# length(unique(tax_table(physeq.glom)[,"Genus"])) # only 16 genera left

# Get the tree
tree <- phyloseq_to_treedata(physeq=physeq.glom)
## [1] "Bacteroidota"
## [1] "Firmicutes"
## [1] "Proteobacteria"
# Plot the tree
ggtree(tree, layout="circular", aes(color=group), lwd=0.4)+
  geom_tiplab(aes(label=new_label), size=2, color="black")+
  geom_nodepoint()+
  scale_color_manual(values=colors)+
  labs(color="Phylum")

Figure 1. The phylogenetic tree of bacteria detected in samples. Only genera present in more than 1% of reads are shown. A

Figure 2 - Bacteroidetes/Firmicutes ratios

# Calculate Bacteroidota/Firmicutes ratio
ratio.FB$logRatioBF <- log2(ratio.FB$Bacteroidota / ratio.FB$Firmicutes)


ggplot(ratio.FB, aes(x = host_subtype, y = logRatioBF))+
  geom_boxplot(aes(fill=host_subtype))+
  scale_fill_manual(values=scales::alpha(c("#006d2c", "#6a51a3", "#0570b0", "#dd3497"), .3))+
  labs(x = "",  y = '', title = "Bacteroidota/Firmicutes ratio")+
  theme_cowplot() +
  theme(legend.position="none")

Figure 2 Bacteroidetes/Firmicutes ratios in the healthy control (HC), IBS-C, IBS-D, and IBS-M groups.

Figure 4 - Box plots of the Simpson index

plot_richness(physeq.zeber, x="host_disease", measures="Simpson", color="host_disease") +
  geom_boxplot(fill=NA, width=0.3) +
  scale_color_manual(values=c("#006d2c", "#fec44f"))+
  theme_bw() +
  labs(x="", y="Simpson")

Figure 4. Box plots of the Simpson index of community diversity in healthy controls (HCs) and IBS patients before (IBS) and after (Treat- ment) treatment.